Forecast use of a city bikeshare system

Bike sharing systems are a means of renting bicycles where the process of obtaining membership, rental, and bike return is automated via a network of kiosk locations throughout a city. Using these systems, people are able to rent a bike from one location and return it to a different place on an as-needed basis. Currently, there are over 500 bike-sharing programs around the world.

The data generated by these systems makes them attractive for researchers because the duration of travel, departure location, arrival location, and time elapsed is explicitly recorded. Bike sharing systems therefore function as a sensor network, which can be used for studying mobility in a city. In this competition, participants are asked to combine historical usage patterns with weather data in order to forecast bike rental demand in the Capital Bikeshare program in Washington, D.C.

You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period.

Data Fields

## Load Packages ##
library(ggplot2) # Graphics
library(Metrics) # For the Root Mean Squared Log Error (RMSLE) function
library(randomForest) # Random Forest

Load the training data from the Bike Sharing data set.

## Load Training Data ##
# Read the raw Kaggle training set. Keep strings as plain character vectors
# (not factors) so the datetime column can be parsed later in addFeatures().
trainRaw <- read.csv("train.csv", header = TRUE, stringsAsFactors = FALSE)

To optimize our predictions, we need to engineer some of the built-in data. First, we will modify $datetime to have year, month, weekday, and hour.

## Feature Engineering Function ##
# Convert the raw bikeshare columns into model-ready features:
#   * recode the rare weather == 4 category into category 3, then turn
#     season/holiday/workingday/weather into factors
#   * parse $datetime and derive hour, weekday, month, and year factors
#   * drop the original datetime column
# Returns the transformed data frame; the caller's copy is not modified.
addFeatures <- function(df){
  # Recode BEFORE converting to factor. The original recoded after
  # factoring, which compared a factor against the number 4 (implicit
  # character coercion) and left an unused level "4" behind.
  df$weather[df$weather == 4] <- 3

  catCols <- c("season", "holiday", "workingday", "weather")
  df[catCols] <- lapply(df[catCols], factor)

  # Parse as POSIXct -- POSIXlt columns inside data frames are fragile.
  # Use format() for component extraction instead of substr() on the
  # implicit character form, which can break (e.g. if every timestamp is
  # midnight, as.character() drops the time portion entirely).
  dt <- as.POSIXct(strptime(as.character(df$datetime),
                            format = "%Y-%m-%d %T", tz = "EST"))

  df$hour <- as.factor(as.integer(format(dt, "%H")))

  # Fix weekday level order so plots run Monday..Sunday, not alphabetical.
  df$weekday <- factor(weekdays(dt),
                       levels = c("Monday",
                                  "Tuesday",
                                  "Wednesday",
                                  "Thursday",
                                  "Friday",
                                  "Saturday",
                                  "Sunday"))

  df$month <- as.factor(as.integer(format(dt, "%m")))
  df$year <- as.factor(as.integer(format(dt, "%Y")))

  #df$yearmonth<-as.factor(format(dt,"%Y%m",tz="EST"))
  df$datetime <- NULL

  return(df)
}

Apply the feature engineering function to the training data:

trainData <- addFeatures(trainRaw)

Let's see what we are working with by using head(trainData).

##   season holiday workingday weather temp  atemp humidity windspeed casual
## 1      1       0          0       1 9.84 14.395       81    0.0000      3
## 2      1       0          0       1 9.02 13.635       80    0.0000      8
## 3      1       0          0       1 9.02 13.635       80    0.0000      5
## 4      1       0          0       1 9.84 14.395       75    0.0000      3
## 5      1       0          0       1 9.84 14.395       75    0.0000      0
## 6      1       0          0       2 9.84 12.880       75    6.0032      0
##   registered count hour  weekday month year
## 1         13    16    0 Saturday     1 2011
## 2         32    40    1 Saturday     1 2011
## 3         27    32    2 Saturday     1 2011
## 4         10    13    3 Saturday     1 2011
## 5          1     1    4 Saturday     1 2011
## 6          1     1    5 Saturday     1 2011

Looking at the fields, we want to determine which have a correlation to bike usage.

## Year Data Visualization
# Mean rentals per hour, split by year, for casual and registered riders.
YearCasual <- aggregate(casual ~ hour + year, data = trainData, FUN = "mean")
YearRegistered <- aggregate(registered ~ hour + year, data = trainData, FUN = "mean")

# NOTE: the original par(mfrow = c(1, 2)) call was removed -- par() only
# affects base graphics and has no effect on ggplot2 output, so the two
# plots were never placed side by side; they render sequentially instead.
ggplot(YearCasual, aes(x = hour, y = casual)) +
  geom_line(aes(group = year, color = year), size = 2, alpha = 0.5)

ggplot(YearRegistered, aes(x = hour, y = registered)) +
  geom_line(aes(group = year, color = year), size = 2, alpha = 0.5)

Clearly, ridership increased from 2011 to 2012. This needs to be taken into account when calculating predictions.

## Weekday Data Visualization
# Average rentals for each hour-of-day / day-of-week combination,
# shown both as line plots and as heat maps (hour on x, weekday on y).
WeekHourCount <- aggregate(count ~ hour + weekday, data = trainData, FUN = "mean")
ggplot(WeekHourCount, aes(x = hour, y = count)) +
  geom_line(aes(group = weekday, color = weekday), size = 2, alpha = 0.5)

ggplot(WeekHourCount, aes(x = hour, y = weekday)) +
  geom_tile(aes(fill = count)) +
  scale_fill_gradient(low = "blue", high = "red")

WeekHourCasual <- aggregate(casual ~ hour + weekday, data = trainData, FUN = "mean")
ggplot(WeekHourCasual, aes(x = hour, y = casual)) +
  geom_line(aes(group = weekday, color = weekday), size = 2, alpha = 0.5)

ggplot(WeekHourCasual, aes(x = hour, y = weekday)) +
  geom_tile(aes(fill = casual)) +
  scale_fill_gradient(low = "blue", high = "red")

WeekHourRegistered <- aggregate(registered ~ hour + weekday, data = trainData, FUN = "mean")
ggplot(WeekHourRegistered, aes(x = hour, y = registered)) +
  geom_line(aes(group = weekday, color = weekday), size = 2, alpha = 0.5)

ggplot(WeekHourRegistered, aes(x = hour, y = weekday)) +
  geom_tile(aes(fill = registered)) +
  scale_fill_gradient(low = "blue", high = "red")

Weekend vs. weekday shows a strong correlation with rider counts for both casual and registered users. We can simplify the modeling to use the single flag of workingday instead of correlating to weekday.

## Holiday Data Visualization
# Compare mean hourly rentals on holidays vs. non-holidays.
HolidayCasual <- aggregate(casual ~ hour + holiday, data = trainData, FUN = "mean")
ggplot(HolidayCasual, aes(x = hour, y = casual)) +
  geom_line(aes(group = holiday, color = holiday), size = 2, alpha = 0.5)

HolidayRegistered <- aggregate(registered ~ hour + holiday, data = trainData, FUN = "mean")
ggplot(HolidayRegistered, aes(x = hour, y = registered)) +
  geom_line(aes(group = holiday, color = holiday), size = 2, alpha = 0.5)

Weather indicates a strong correlation (logically).

## Season Data Visualization
# Mean hourly rentals broken out by season.
SeasonCasual <- aggregate(casual ~ hour + season, data = trainData, FUN = "mean")
ggplot(SeasonCasual, aes(x = hour, y = casual)) +
  geom_line(aes(group = season, color = season), size = 2, alpha = 0.5)

SeasonRegistered <- aggregate(registered ~ hour + season, data = trainData, FUN = "mean")
ggplot(SeasonRegistered, aes(x = hour, y = registered)) +
  geom_line(aes(group = season, color = season), size = 2, alpha = 0.5)

## Weather Data Visualization
# Mean hourly rentals under each weather category.
WeatherCasual <- aggregate(casual ~ hour + weather, data = trainData, FUN = "mean")
ggplot(WeatherCasual, aes(x = hour, y = casual)) +
  geom_line(aes(group = weather, color = weather), size = 2, alpha = 0.5)

WeatherRegistered <- aggregate(registered ~ hour + weather, data = trainData, FUN = "mean")
ggplot(WeatherRegistered, aes(x = hour, y = registered)) +
  geom_line(aes(group = weather, color = weather), size = 2, alpha = 0.5)

## Temperature Visualization
# Jittered scatter plots of rentals against (feels-like) temperature,
# coloured by season.
ggplot(trainData, aes(x = temp, y = casual)) +
  geom_jitter(aes(group = season, color = season))

ggplot(trainData, aes(x = temp, y = registered)) +
  geom_jitter(aes(group = season, color = season))

ggplot(trainData, aes(x = atemp, y = casual)) +
  geom_jitter(aes(group = season, color = season))

ggplot(trainData, aes(x = atemp, y = registered)) +
  geom_jitter(aes(group = season, color = season))

## Humidity Visualization
ggplot(trainData, aes(x = humidity, y = casual)) +
  geom_jitter(aes(group = season, color = season))

ggplot(trainData, aes(x = humidity, y = registered)) +
  geom_jitter(aes(group = season, color = season))

## Temperature and Humidity correlation with season and weather
# Density curves show how humidity/atemp distributions shift across
# seasons and weather categories.
ggplot(trainData, aes(x = humidity)) +
  geom_density(aes(group = season, color = season), size = 2, alpha = 0.5)

ggplot(trainData, aes(x = humidity)) +
  geom_density(aes(group = weather, color = weather), size = 2, alpha = 0.5)

ggplot(trainData, aes(x = atemp)) +
  geom_density(aes(group = season, color = season), size = 2, alpha = 0.5)

ggplot(trainData, aes(x = atemp)) +
  geom_density(aes(group = weather, color = weather), size = 2, alpha = 0.5)

## Windspeed Visualization
ggplot(trainData, aes(x = windspeed, y = casual)) +
  geom_jitter(aes(group = season, color = season))

ggplot(trainData, aes(x = windspeed, y = registered)) +
  geom_jitter(aes(group = season, color = season))

Our prediction method will be a Random Forest.

## Prediction Algorithm
# Hold out 1000 rows (~10% of the data) as a validation set. Seed the RNG
# *before* sampling so the train/validation split is reproducible -- the
# original seeded after sampling, so the split changed on every run and the
# reported validation errors could not be reproduced.
set.seed(21)
ind <- sample(nrow(trainData), 1000)
trainSubset <- trainData[-ind, ]
trainTest <- trainData[ind, ]

# One response frame per model: each random forest predicts one component
# (casual or registered) of the total count.
trainCasual <- subset(trainSubset, select = -c(count, registered))
trainRegistered <- subset(trainSubset, select = -c(count, casual))

# Random-forest hyperparameters shared by both models.
rfNtree <- 500
rfMtry  <- 5
rfImportance <- TRUE

# Exploratory all-variable fits used to rank importance (kept for reference):
#casualFit <- randomForest(casual ~., data = trainCasual, 
#                          ntree = rfNtree, mtry = rfMtry, importance = rfImportance)
#varImpPlot(casualFit)
#registeredFit <- randomForest(registered ~., data = trainRegistered, 
#                          ntree = rfNtree, mtry = rfMtry, importance = rfImportance)
#varImpPlot(registeredFit)

Casual Fit, in order of %IncMSE: hour, year, humidity, temp, atemp, month, workingday, weather, windspeed, casualdayflag, weekday, holiday, season. Registered Fit: hour, year, weather, month, workingday, humidity, season, weekday, temp, atemp, casualdayflag, windspeed, holiday.

# Feature sets chosen from the importance rankings above. Fit first on the
# training subset ONLY, so the held-out rows give an honest out-of-sample
# error estimate. (The original fit on all of trainData and then "evaluated"
# on trainTest, which is a subset of trainData -- those rows were seen during
# training, so the reported test errors were optimistic.)
casualFormula <- casual ~ hour + year + humidity + temp + atemp + workingday + weekday
registeredFormula <- registered ~ hour + year + humidity + temp + atemp + workingday + weekday

casualFit <- randomForest(casualFormula, data = trainSubset,
                          ntree = rfNtree, mtry = rfMtry, importance = rfImportance)
registeredFit <- randomForest(registeredFormula, data = trainSubset,
                              ntree = rfNtree, mtry = rfMtry, importance = rfImportance)

## Evaluate Fit on the Held-Out Subset
# (The recorded values below came from the original leaky evaluation and
# will differ somewhat after the fix.)
trainTest$casualpred <- predict(casualFit, trainTest)
errCasualTest <- rmsle(trainTest$casual, trainTest$casualpred)
errCasualTest
## [1] 0.3132188
trainTest$registeredpred <- predict(registeredFit, trainTest)
errRegisteredTest <- rmsle(trainTest$registered, trainTest$registeredpred)
errRegisteredTest
## [1] 0.2219326
# Total count is modeled as casual + registered, rounded to whole rentals.
trainTest$countpred <- round(trainTest$casualpred + trainTest$registeredpred, 0)
errCountTest <- rmsle(trainTest$count, trainTest$countpred)
errCountTest
## [1] 0.2178348

## Evaluate Fit to Training Data
trainData$casualpred <- predict(casualFit, trainData)
errCasualTrain <- rmsle(trainData$casual, trainData$casualpred)
errCasualTrain
## [1] 0.3347693
trainData$registeredpred <- predict(registeredFit, trainData)
errRegisteredTrain <- rmsle(trainData$registered, trainData$registeredpred)
errRegisteredTrain
## [1] 0.2318618
trainData$countpred <- round(trainData$casualpred + trainData$registeredpred, 0)
errCountTrain <- rmsle(trainData$count, trainData$countpred)
errCountTrain
## [1] 0.2304273

## Refit on ALL training data for the final submission -- now that the
## error estimate is recorded, more training data yields a better model.
casualFit <- randomForest(casualFormula, data = trainData,
                          ntree = rfNtree, mtry = rfMtry, importance = rfImportance)
registeredFit <- randomForest(registeredFormula, data = trainData,
                              ntree = rfNtree, mtry = rfMtry, importance = rfImportance)

## Build the Kaggle submission file
testRaw <- read.csv("test.csv", header = TRUE, stringsAsFactors = FALSE)
testData <- addFeatures(testRaw)
testData$casual <- predict(casualFit, testData)
testData$registered <- predict(registeredFit, testData)
testData$count <- round(testData$casual + testData$registered, 0)
submit <- data.frame(datetime = testRaw$datetime, count = testData$count)
write.csv(submit, file = "bikeshare_randomForest_submission.csv", row.names = FALSE)